{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/abulbasar/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
      "/Users/abulbasar/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
      "  DeprecationWarning)\n",
      "/Users/abulbasar/anaconda3/lib/python3.6/site-packages/sklearn/learning_curve.py:22: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the functions are moved. This module will be removed in 0.20\n",
      "  DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import *\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from mlxtend.plotting import plot_decision_regions\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>32</th>\n",
       "      <th>33</th>\n",
       "      <th>34</th>\n",
       "      <th>35</th>\n",
       "      <th>36</th>\n",
       "      <th>37</th>\n",
       "      <th>38</th>\n",
       "      <th>39</th>\n",
       "      <th>40</th>\n",
       "      <th>41</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>215</td>\n",
       "      <td>45076</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>162</td>\n",
       "      <td>4528</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>236</td>\n",
       "      <td>1228</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>233</td>\n",
       "      <td>2032</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.33</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>239</td>\n",
       "      <td>486</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0    1     2   3    4      5   6   7   8   9    ...     32   33   34    35  \\\n",
       "0   0  tcp  http  SF  215  45076   0   0   0   0   ...      0  0.0  0.0  0.00   \n",
       "1   0  tcp  http  SF  162   4528   0   0   0   0   ...      1  1.0  0.0  1.00   \n",
       "2   0  tcp  http  SF  236   1228   0   0   0   0   ...      2  1.0  0.0  0.50   \n",
       "3   0  tcp  http  SF  233   2032   0   0   0   0   ...      3  1.0  0.0  0.33   \n",
       "4   0  tcp  http  SF  239    486   0   0   0   0   ...      4  1.0  0.0  0.25   \n",
       "\n",
       "    36   37   38   39   40       41  \n",
       "0  0.0  0.0  0.0  0.0  0.0  normal.  \n",
       "1  0.0  0.0  0.0  0.0  0.0  normal.  \n",
       "2  0.0  0.0  0.0  0.0  0.0  normal.  \n",
       "3  0.0  0.0  0.0  0.0  0.0  normal.  \n",
       "4  0.0  0.0  0.0  0.0  0.0  normal.  \n",
       "\n",
       "[5 rows x 42 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"/data/kddcup.data\", header=None)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Category']\n"
     ]
    }
   ],
   "source": [
    "columns = [f.split(\":\")[0] for f in \"\"\"\n",
    "duration: continuous.\n",
    "protocol_type: symbolic.\n",
    "service: symbolic.\n",
    "flag: symbolic.\n",
    "src_bytes: continuous.\n",
    "dst_bytes: continuous.\n",
    "land: symbolic.\n",
    "wrong_fragment: continuous.\n",
    "urgent: continuous.\n",
    "hot: continuous.\n",
    "num_failed_logins: continuous.\n",
    "logged_in: symbolic.\n",
    "num_compromised: continuous.\n",
    "root_shell: continuous.\n",
    "su_attempted: continuous.\n",
    "num_root: continuous.\n",
    "num_file_creations: continuous.\n",
    "num_shells: continuous.\n",
    "num_access_files: continuous.\n",
    "num_outbound_cmds: continuous.\n",
    "is_host_login: symbolic.\n",
    "is_guest_login: symbolic.\n",
    "count: continuous.\n",
    "srv_count: continuous.\n",
    "serror_rate: continuous.\n",
    "srv_serror_rate: continuous.\n",
    "rerror_rate: continuous.\n",
    "srv_rerror_rate: continuous.\n",
    "same_srv_rate: continuous.\n",
    "diff_srv_rate: continuous.\n",
    "srv_diff_host_rate: continuous.\n",
    "dst_host_count: continuous.\n",
    "dst_host_srv_count: continuous.\n",
    "dst_host_same_srv_rate: continuous.\n",
    "dst_host_diff_srv_rate: continuous.\n",
    "dst_host_same_src_port_rate: continuous.\n",
    "dst_host_srv_diff_host_rate: continuous.\n",
    "dst_host_serror_rate: continuous.\n",
    "dst_host_srv_serror_rate: continuous.\n",
    "dst_host_rerror_rate: continuous.\n",
    "dst_host_srv_rerror_rate: continuous.\n",
    "\"\"\".split(\"\\n\") if len(f)>0]\n",
    "\n",
    "columns.append(\"Category\")\n",
    "print(columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns = columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4898431 entries, 0 to 4898430\n",
      "Data columns (total 42 columns):\n",
      "duration                       int64\n",
      "protocol_type                  object\n",
      "service                        object\n",
      "flag                           object\n",
      "src_bytes                      int64\n",
      "dst_bytes                      int64\n",
      "land                           int64\n",
      "wrong_fragment                 int64\n",
      "urgent                         int64\n",
      "hot                            int64\n",
      "num_failed_logins              int64\n",
      "logged_in                      int64\n",
      "num_compromised                int64\n",
      "root_shell                     int64\n",
      "su_attempted                   int64\n",
      "num_root                       int64\n",
      "num_file_creations             int64\n",
      "num_shells                     int64\n",
      "num_access_files               int64\n",
      "num_outbound_cmds              int64\n",
      "is_host_login                  int64\n",
      "is_guest_login                 int64\n",
      "count                          int64\n",
      "srv_count                      int64\n",
      "serror_rate                    float64\n",
      "srv_serror_rate                float64\n",
      "rerror_rate                    float64\n",
      "srv_rerror_rate                float64\n",
      "same_srv_rate                  float64\n",
      "diff_srv_rate                  float64\n",
      "srv_diff_host_rate             float64\n",
      "dst_host_count                 int64\n",
      "dst_host_srv_count             int64\n",
      "dst_host_same_srv_rate         float64\n",
      "dst_host_diff_srv_rate         float64\n",
      "dst_host_same_src_port_rate    float64\n",
      "dst_host_srv_diff_host_rate    float64\n",
      "dst_host_serror_rate           float64\n",
      "dst_host_srv_serror_rate       float64\n",
      "dst_host_rerror_rate           float64\n",
      "dst_host_srv_rerror_rate       float64\n",
      "Category                       object\n",
      "dtypes: float64(15), int64(23), object(4)\n",
      "memory usage: 1.5+ GB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.select_dtypes(include=[np.float64, np.int64]).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "smurf.              2807886\n",
       "neptune.            1072017\n",
       "normal.              972781\n",
       "satan.                15892\n",
       "ipsweep.              12481\n",
       "portsweep.            10413\n",
       "nmap.                  2316\n",
       "back.                  2203\n",
       "warezclient.           1020\n",
       "teardrop.               979\n",
       "pod.                    264\n",
       "guess_passwd.            53\n",
       "buffer_overflow.         30\n",
       "land.                    21\n",
       "warezmaster.             20\n",
       "imap.                    12\n",
       "rootkit.                 10\n",
       "loadmodule.               9\n",
       "ftp_write.                8\n",
       "multihop.                 7\n",
       "phf.                      4\n",
       "perl.                     3\n",
       "spy.                      2\n",
       "Name: Category, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = df.Category\n",
    "y.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train (3428901, 38) X_test (1469530, 38)\n",
      "Training accuracy: 0.9972489727758252 Test accuracy: 0.9971732458677264\n",
      "CPU times: user 4min 38s, sys: 16.6 s, total: 4min 54s\n",
      "Wall time: 4min 56s\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "y = np.where(df.Category == \"normal.\", 0, 1)\n",
    "X = df.select_dtypes(include=[np.float64, np.int64]).values\n",
    "\n",
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, \n",
    "                                        test_size = 0.3, random_state = 12345)\n",
    "\n",
    "scaler = preprocessing.StandardScaler()\n",
    "X_train_std = scaler.fit_transform(X_train)\n",
    "X_test_std = scaler.transform(X_test)\n",
    "\n",
    "print(\"X_train\", X_train.shape, \"X_test\", X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.801592\n",
       "0    0.198408\n",
       "dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(y_train).value_counts()/len(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12.4 s, sys: 6.14 s, total: 18.5 s\n",
      "Wall time: 15.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "pca = decomposition.PCA(random_state=1)\n",
    "pca.fit(X_train_std)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0,0.5,'Variance retention')"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3XecVPW9//HXZ5cm3aUqHQFpUnSF\nWGJFRRPbvWo0JpaomESj4k3ReH+23HhjizGJ5WKJJRo1WIIRRWNAsQuCwoKFtrAgHZaybJ3P7485\nO47rsjvAnj0zO+/n47GPPefM95x5c9idz572/Zq7IyIiApATdQAREUkfKgoiIpKgoiAiIgkqCiIi\nkqCiICIiCSoKIiKSoKIgIiIJKgoiIpKgoiAiIgnNog6wqzp37ux9+/aNOoaISEaZPXv2enfvUl+7\njCsKffv2ZdasWVHHEBHJKGZWmEo7nT4SEZEEFQUREUlQURARkQQVBRERSVBREBGRhNCKgpk9bGZr\nzWz+Tl43M/ujmS0ys0/M7MCwsoiISGrCPFJ4BBhfx+snAgODrwnAfSFmERGRFIT2nIK7v2lmfeto\ncirwmMfHA33PzDqa2T7u/mVYmUREanJ33MGrpxPL4au5na0bejxa5OaQk2Phv1EgyofXegArkuaL\ngmUqCiIRcXe2llWybmsZ67aWUVJeSVlFjLLKGGWVVfHvFV9Nl1fFqKxyqmJORTBdEYtRFXMqq5zK\nWIyYQyzmVHm8Xaz6e4z4tPtXbYLXY8nLqudjfPN1d2Kx4KM7+GCPJT7ka3zY1/jgz5Th6V+64nCG\n7duh0d4vyqJQW+mr9b/JzCYQP8VE7969w8wk0iS5O8U7Kli9pZQvi0tZU1zK6i2liQ//ddvKEtNl\nlbGUtmkGzXNzaJ5jNMvNoVmO0SzXaJaTE3yPT+fkGDkGuTlGjhm5OUauGTk50Dwnhxyz4OurNtXt\nLFiWa4aZkZtD/PXEsvh8dZ4cMyyYtsR0vF1iOV/Nk9yer7cjWDeV/fC1+Vo/2nZf13atGnR79Ymy\nKBQBvZLmewKramvo7pOASQD5+fkZUt9FGo+7s2F7OYUbtlO4oYRlG0pYvmE7q4pLWbOllNXFpbV+\n2Oe1aUHXdi3p0q4l/Tq1oUsw3aVdSzq3bUnbls1o2TyHls1yadksJ/7VPD7dLMdS+tCUzBJlUZgC\nXG5mTwFjgWJdTxCpm7uzcvMO5hUVM39VMUvXb2fZ+hKWbyxhW1llol2OwT4d9qJHx70Y0bMjxw9t\nSbf2rdinw1507xCf7tquFS2a6a50+brQioKZ/Q04CuhsZkXADUBzAHe/H5gKnAQsAkqAC8PKIpKJ\n3J01W8r4pGgz81YW80lRMfNWFrNxezkAzXKM3p1a0yevNWP65dGnU2v6dmpDn06t6bl3a33gy24J\n8+6jc+p53YHLwnp/kUxTXhmjYFUxsws3MWvZJmYv38S6rWVA/Lz6oG7tOG5INw7o2YERPTuwf/d2\ntGyWG3FqaWoyrutskaaieEcFHxVuYlbhRmYt28THRZsprYif9++VtxeHD+jMqF4dOaBnB4bu055W\nzVUAJHwqCiKNZEd5FbMKN/LWovW8s2gD81cV4x4/Chi2b3vOGdObg/vmkd9nb7q2b9w7TkSqqSiI\nhKSyKsa8lcW8vWg9by/awOzCTZRXxWiea4zutTdXHjuQMf3yGNWrI61b6FdR0oN+EkUaWHFJBQ+9\ntYRH3y2keEcFAEP2ac/5h/bh0AGdGdM3jzYt9asn6Uk/mSINpHhHBQ+/tZSH317K1tJKxg/rzndH\n7sMh/TvRqW3LqOOJpERFQWQPbS2t4C9vL+PBmUvYEhSDK8cNZMg+7aOOJrLLVBREdtO2skoefWcZ\nD8xcwuaSCsYN6cZV4wYyvEfj9VMj0tBUFER20faySh57t5BJby5mU0kFxwzuylXjBjKiZ8eoo4ns\nMRUFkRTtKK/ir+8Vcv8bi9mwvZwjBnXh6uMGMaqXioE0HSoKIvUorajiyfeXc++MxazfVsbhAzoz\n8biBHNQnL+poIg1ORUFkJ8oqq3jqgxXcO2MRa7aUcUj/Ttx77oGM6adiIE2XioJIDbGY8/ycldzx\n6md8WVzKmL553PW9URy6X+eoo4mETkVBJMmc5Zu46cUFzF2xmZE9O3D7GSM5bEAnjRsgWUNFQQRY\ns6WUW1/5lOc+WkmXdi2588yRnD66R6OOjSuSDlQUJKuVVlTx0FtLuWf6IiqrnJ8ctR+XHT2AtuqG\nQrKUfvIlK7k7ry5Yw29fWsjyjSUcP7Qb131nCH06tYk6mkikVBQk6yxdv50bphTw5ufrGNStLX+9\naCyHD9RFZBFQUZAsUlpRxb0zFnP/jMW0bJbD9d8dynmH9KFZroatFKmmoiBZYfqna7lhSgHLN5Zw\n6qh9ue6kIRrIRqQWKgrSpK3cvIObXyxgWsEa9uvShicvHsuhA3SqSGRnVBSkSaqoivHQW0u5+19f\n4Di/HL8/Fx/enxbNdKpIpC4qCtLkfLxiM7969hM+Xb2V44d24/qTh9Jz79ZRxxLJCCoK0mTsKK/i\nrn99zoMzl9ClXUseOC+f44Z2izqWSEZRUZAm4Z3F67n2uXkUbijh+2N7c82Jg2nfqnnUsUQyjoqC\nZLTiHRX879SFPPXhCvp0as3fLvkWh+zXKepYIhlLRUEy1qsFq/nvF+azflsZlx7Rn6vGDWKvFrlR\nxxLJaCoKknE2bS/n+ikFvPjxKgZ3b8eD5+drKEyRBqKiIBnlXwvWcO3z89i0vZyJ4wbx06P3o7me\nSBZpMCoKkhG2lFZw84sLmDy7iMHd2/HIhQczbN8OUccSaXJUFCTtvfn5On717Ces3VrG5UcP4Ipj\nB+ohNJGQqChI2tpWVsktUxfy5PvLGdC1Lc/94CBG9tK1A5EwqShIWnp/yQZ+PvljijbtYMIR/bn6\nuEG0aq47i0TCFuoxuJmNN7PPzGyRmV1Ty+u9zWy6mc0xs0/M7KQw80j6K6us4n+nLuTsB94jx4y/\nX3oIvz5piAqCSCMJ7UjBzHKBe4DjgCLgQzOb4u4Lkpr9N/CMu99nZkOBqUDfsDJJelv45RYmPj2X\nT1dv5dyxvbnuO0No3UIHsyKNKczfuDHAIndfAmBmTwGnAslFwYH2wXQHYFWIeSRNVcWch95awh3T\nPqf9Xs15+IJ8jhmsPotEohBmUegBrEiaLwLG1mhzI/Cqmf0MaAOMCzGPpKGiTSX81zMf8/7SjZww\nrBu3nH4Andq2jDqWSNYKsyhYLcu8xvw5wCPufqeZHQI8bmbD3T32tQ2ZTQAmAPTu3TuUsNK43J3n\nPlrJDVMKALjjzJH854E9MKvtx0ZEGkuYRaEI6JU035Nvnh66CBgP4O7vmlkroDOwNrmRu08CJgHk\n5+fXLCySYTZuL+e65+fx8vzVjOmbx51njaRXnsY7EEkHYRaFD4GBZtYPWAmcDXy/RpvlwLHAI2Y2\nBGgFrAsxk0Rs+mdr+eXkT9hcUs41Jw7mkm/3JzdHRwci6SK0ouDulWZ2OTANyAUedvcCM7sZmOXu\nU4D/Ah4ws4nETy1d4O46EmiCSsor+e1LC3ni/eUM7t6ORy8cw9B929e/oog0qlDv93P3qcRvM01e\ndn3S9ALgsDAzSPTmLN/ExKfnUrixRA+iiaQ53QQuoamoivGn17/gnhmL6d6+FU9erAFwRNKdioKE\nYtHabUx8ei7zVhbzHwf24MZThml4TJEMoKIgDcrdeezdQm6ZupDWLXK579wDOfGAfaKOJSIpUlGQ\nBrNmSyk///vHzPxiPUcO6sLtZ4yga/tWUccSkV2goiAN4qVPvuTXz8+jvDLG/5w2nHPH9taDaCIZ\nSEVB9kjxjgpunFLA83NWMrJXR+46ayT9u7SNOpaI7CYVBdlt7yxez8+f+Zg1W8uYOG4Qlx29H800\nXrJIRkupKATdYHdLbu/uy8MKJemttKKKO6Z9xoNvLaV/5zY895NDNSKaSBNRb1EIejC9AVgDVHdU\n58CIEHNJmipYVczEp+fy+Zpt/PBbffj1SUPYq4UeRBNpKlI5UrgS2N/dN4QdRtJXVcyZ9OYSfv/a\nZ3Rs3YJHLjyYo/bvGnUsEWlgqRSFFUBx2EEkfa3YWMLVz8zlw2WbOHF4d245/QD2btMi6lgiEoJU\nisISYIaZvQSUVS9099+HlkrSgrvz99lF3DSlgBwzfn/WSE4frTEPRJqyVIrC8uCrRfAlWWDDtjKu\nfW4ery5Yw9h+8TEPeu6tMQ9Emrp6i4K73wRgZu3is74t9FQSqX9/uoZfTp7Hlh0VXHfSEC46vB85\nGvNAJCukcvfRcOBxIC+YXw+c5+4FIWeTRlZzzIPHLxrDkH005oFINknl9NEk4Gp3nw5gZkcBDwCH\nhphLGtncFZuZ+PRclm3YrjEPRLJYKkWhTXVBAHD3GWbWJsRM0ogqq2LcM30xf/z3F3Rr11JjHohk\nuZTuPjKz/0f8FBLAD4Cl4UWSxrJs/XYmPjOXOcs3c9qofbnp1OF02EtjHohks1SKwo+Am4DnAAPe\nBC4MM5SEy915+sMV3PzPBTTLMf50zmhOHrlv1LFEJA2kcvfRJuCKRsgijWBzSTnXPjePl+ev5tD9\nOnHnWSPZp8NeUccSkTSx06JgZn9w96vM7EXifR19jbufEmoyaXDvL9nAVU/PZd3WMq49cTCXfLu/\nbjUVka+p60ih+hrCHY0RRMJTWRXj7te/4J7pi+id15rnfnooI3qqV1MR+aadFgV3nx1MjnL3u5Nf\nM7MrgTfCDCYNY8XGEq58ag4fLd/MGQf15MZThtG2pYbREJHapTIiyvm1LLuggXNICKZ8vIqT7p7J\nF2u28cdzRnPHmSNVEESkTnVdUzgH+D7Qz8ymJL3UDlA32mlse1klN0wpYPLsIg7s3ZG7zx5Nrzz1\nWyQi9avrz8Z3gC+BzsCdScu3Ap+EGUp23/yVxVzxtzks3bCdK44ZwBXHDtQQmSKSsrquKRQChcAh\njRdHdpe785e3l/G7lz8lr00LPZksIrsllQ7x/gO4FehK/OE1I95bqnpKSxMbt5fzi79/zOufrmXc\nkK7cdsZI8jQIjojshlSuOt4GnOzuC8MOI7vuncXrmfj0XDZtr+CGk4dywaF9NQiOiOy2VIrCGhWE\n9FP97MGfpy+iX6c2PHT+wQzv0SHqWCKS4VIpCrPM7GngBb4+HOdzoaWSOn1ZvIOfPTmHWYWbOOOg\nntx0yjDa6FZTEWkAqXyStAdKgOOTljnxDvKkkU3/bC1XPz2X8soYf/jeKE4b3SPqSCLShKTSIZ56\nRE0DlVUxfv/a59w7YzGDu7fjnnMPZL8ubaOOJSJNTL03sJvZIDN73czmB/MjzOy/U9m4mY03s8/M\nbJGZXbOTNmeZ2QIzKzCzJ3ctfnZYXVzK9x94n3tnLOacMb144bLDVBBEJBSpnD56APgF8H8A7v5J\n8OH9P3WtZGa5wD3AcUAR8KGZTXH3BUltBgLXAoe5+yYz67p7/4ym643P1zHx6bmUVlTpdJGIhC6V\notDa3T+ocZtjZQrrjQEWufsSADN7CjgVWJDU5hLgnmDMBtx9bUqps0BlVYw//OsL7pmxiEFd46eL\nBnTV0YGIhCuVorDezPYjGFPBzM4g3v1FfXoAK5Lmi4CxNdoMCrb5NpAL3Ojur6Sw7SatpLySix+d\nxTuLN3BWfk9uOmU4e7XIjTqWiGSBVIrCZcAkYLCZrSQ+PvO5KaxX2xNUNQfraQYMBI4CegIzzWy4\nu2/+2obMJgATAHr37p3CW2eu0ooqJjw2m/eWbOC2M0ZwVn6vqCOJSBZJpac0d/dxQBdgsLsfnuJ6\nRUDyJ1pPYFUtbf7h7hXuvhT4jHiRqBlgkrvnu3t+ly5dUnjrzFRRFeNnf5vDW4vWc9sZI1UQRKTR\npfLh/iyAu293963BsskprPchMNDM+plZC+BsYEqNNi8ARwOYWWfip5OWpBK8qYnFnF9O/oTXFqzh\nxpOHcsZBPaOOJCJZqK7xFAYDw4AOQad41doDrerbsLtXmtnlwDTi1wsedvcCM7sZmOXuU4LXjjez\nBUAV8At3z7qxGtyd66fM5/k5K/nFCftzwWH9oo4kIlmqrmsK+wPfBToCJyct30r8rqF6uftUYGqN\nZdcnTTtwdfCVtW595TP++t5yLj2yPz89ar+o44hIFqtrPIV/AP8ws0Pc/d1GzJRV7pm+iPvfWMy5\nY3tzzfjB6uFURCKVyjWFDbv7RLPU7dF3lnH7tM84bdS+/ObU4SoIIhK5VIrCA8SfOq6A+BPNxC8a\nyx6YPLuIG6YUcNzQbtx+5khyclQQRCR6qRSF1u7+QY1lqTzRLDvxzuL1/OrZTzhsQCf+dM5ommsM\nZRFJE6l8Gu3uE81Si5Wbd3D5k3Po26k19//gIFo115PKIpI+wnyiWWooraji0sdnUVEZY9J5+bRr\n1TzqSCIiX1NnUTCzHCDf3ceZWRsgJ+kBNtkF7s6vn5/H/JVbePC8fHV9LSJpqc7TR+4eAy4Pprer\nIOy+R95ZxnMfrWTiuEGMG9ot6jgiIrVK5ZrCa2b2czPrZWZ51V+hJ2tC3luygf95aSHHDe3Gz44Z\nEHUcEZGdSuWawo+C75clLXOgf8PHaXpWbt7BZU98RJ9Orfn9Wbr1VETSWypjNKsjnt1UWlHFjx+f\nTVlljEk/1IVlEUl/qRwpyG5wd657fj7zVhbzwHn5GjVNRDKCnpoKyWPvFvLsR0VceexAjtOFZRHJ\nECoKIVi+oYTf/HMB44Z05cpjvzFmkIhI2qq3KFjcD8zs+mC+t5mNCT9a5rr/zcXkmHHL6QfowrKI\nZJRUjhTuBQ4BzgnmtwL3hJYow63dUsrkWUWckd+Tru3rHYtIRCStpHKheay7H2hmcwDcfVMwvKbU\n4qG3llIZi3HpEbpjV0QyTypHChVmlstXHeJ1AWKhpspQxSUV/PW9Qk4euS99OrWJOo6IyC5LpSj8\nEXge6GpmvwXeAm4JNVWGevTdZWwvr+InGlJTRDJUKg+vPWFms4FjAQNOc/eFoSfLMCXllfzl7aUc\nO7grg7u3jzqOiMhuqbcomNm3gAJ3vyeYb2dmY939/dDTZZC/fbCCTSUV/PRo9W0kIpkrldNH9wHb\nkua3B8skUF4Z48GZSxjbL4+D+uwddRwRkd2WSlEwd/fqmaA7bXWPkeSFOSv5srhURwkikvFSKQpL\nzOwKM2sefF0JLAk7WKaoijn3vbGY4T3ac8TAzlHHERHZI6kUhR8DhwIrgSJgLDAhzFCZ5JX5q1m6\nfjs/PWoAZnp6WUQyWyp3H60Fzm6ELBnH3bl3xiL6d27DCcO6Rx1HRGSPpXL3URfgEqBvcnt3/9HO\n1skWb3y+joJVW7jtjBHkqo8jEWkCUrlg/A9gJvAvoCrcOJnl3hmL2adDK04b1SPqKCIiDSKVotDa\n3X8VepIMM7twIx8s3cj13x1Ki2bqgVxEmoZUPs3+aWYnhZ4kw9w7fTF5bVpw9pheUUcREWkwqRSF\nK4kXhh1mtsXMtprZlrCDpbPPVm/l9U/XcuGhfWndQo9siEjTkcrdR+0aI0gmeWDmEvZqnssPD+kT\ndRQRkQaV0slwM9vbzMaY2RHVXymuN97MPjOzRWZ2TR3tzjAzN7P8VINHZc2WUv4xdyXfO7gXHVtr\nWAkRaVpSuSX1YuKnkHoCc4FvAe8Cx9SzXi7xEdqOI/7Q24dmNsXdF9Ro1w64AsiIDvYeeWcZVTHn\nR4f1izqKiEiDS/WawsFAobsfDYwG1qWw3hhgkbsvcfdy4Cng1Fra/Qa4DShNLXJ0tpVV8sR7hZw4\nfB96d2oddRwRkQaXSlEodfdSADNr6e6fAvunsF4PYEXSfFGwLMHMRgO93P2fKeaN1DMfrmBLaSUX\nf1tHCSLSNKVy60yRmXUEXgBeM7NNwKoU1qvtEd9Eb6tmlgPcBVxQ74bMJhD0t9S7d+8U3rrhVVbF\neOitpYzpm8fo3uoeW0SaplTuPjo9mLzRzKYDHYBXUth2EZB8E39Pvl5M2gHDgRlBR3LdgSlmdoq7\nz6qRYRIwCSA/P9+JwMvzV7Ny8w5uPGVYFG8vItIodloUzKy9u28xs7ykxfOC722BjfVs+0NgoJn1\nI97D6tnA96tfdPdiINHXtJnNAH5esyCkA3dn0ptL6N+5DccO7hp1HBGR0NR1pPAk8F1gNvHTPlbj\ne/+6NuzulWZ2OTANyAUedvcCM7sZmOXuUxogf6N4f+lG5q0s5pbTDyBHHd+JSBO206Lg7t+1+Hmd\nI919+e5s3N2nAlNrLLt+J22P2p33aAwPvLmETm1a8B8HquM7EWna6rz7KBiG8/lGypKWFq2Nd2lx\n3iF9adU8N+o4IiKhSuWW1PfM7ODQk6SpB2cupWWzHHVpISJZIZVbUo8GLjWzQmA7wTUFdx8RarI0\nsHZrKc99tJKzDu5JXht1aSEiTV8qReHE0FOkqcffLaQiFuOiw+u8pi4i0mSk8pxCIYCZdQVahZ4o\nTZSUV/L4e4UcP7Qb/Tq3iTqOiEijqPeagpmdYmZfAEuBN4BlwMsh54rc5NlFbC6pYMIROkoQkeyR\nyoXm3xDvGfVzd+8HHAu8HWqqiFXFnAdnLuXA3h05qE9e/SuIiDQRqRSFCnffAOSYWY67TwdGhZwr\nUu8u3sDyjSW6liAiWSeVC82bzawt8CbwhJmtBSrDjRWtVwq+pHWLXI4doi4tRCS7pHKkcCqwA5hI\nvCO8xcDJYYaKUizmvFqwhqP276KH1UQk69TVId6fgSfd/Z2kxY+GHylac1ZsZu3WMk4Y1j3qKCIi\nja6uI4UvgDvNbJmZ3WpmTfo6QrVpBatpnmscrd5QRSQL7bQouPvd7n4IcCTxbrL/YmYLzex6MxvU\naAkbkbszrWA1h+7XmfatmkcdR0Sk0dV7TcHdC939VncfTXw8hNOBhaEni8Cnq7dSuKGE8cN16khE\nslMqD681N7OTzewJ4g+tfQ78Z+jJIjCtYDVmMG5It6ijiIhEoq4LzccB5wDfAT4AngImuPv2RsrW\n6F6Zv5qD++TRpV3LqKOIiESiriOFXwPvAkPc/WR3f6IpF4TCDdv5dPVWjh+mowQRyV51jbx2dGMG\nidq0gtUAuhVVRLJaKg+vZYVpBWsYtm97euW1jjqKiEhkVBSAtVtKmV24ifE6ShCRLKeiALy6YA0A\nJ+hWVBHJcioKxK8n9O/choFd20YdRUQkUllfFIpLKnh38QaOH9YdM4s6johIpLK+KLz+6RoqY66n\nmEVEUFFgWsFqurdvxYgeHaKOIiISuawuCjvKq3jj83WcMKwbOTk6dSQiktVF4Y3P11FaEdMDayIi\ngawuCtMKVtOxdXPG9MuLOoqISFrI2qJQXhnj9YVrGDekG81ys3Y3iIh8TdZ+Gr63ZANbSiv1FLOI\nSJKsLQrTClbTukUuhw/sHHUUEZG0kZVFoSrmTCtYw9H7d6VV89yo44iIpI2sLApzlm9i/bYyjZ0g\nIlKDuXt4GzcbD9wN5AIPuvvvarx+NXAxUAmsA37k7oV1bTM/P99nzZq1W3n6XvPSTl9b9rvv7NY2\nRUQygZnNdvf8+tqFdqRgZrnAPcCJwFDgHDMbWqPZHCDf3UcAk4HbwsojIiL1C/P00Rhgkbsvcfdy\n4mM8n5rcwN2nu3tJMPse0DPEPCIiUo8wi0IPYEXSfFGwbGcuAl6u7QUzm2Bms8xs1rp16xowooiI\nJAuzKNTWmVCtFzDM7AdAPnB7ba+7+yR3z3f3/C5dujRgRBERSdYsxG0XAb2S5nsCq2o2MrNxwHXA\nke5eFmIeERGpR5hHCh8CA82sn5m1AM4GpiQ3MLPRwP8Bp7j72hCziIhICkIrCu5eCVwOTAMWAs+4\ne4GZ3WxmpwTNbgfaAn83s7lmNmUnmxMRkUYQ5ukj3H0qMLXGsuuTpseF+f4iIrJrsvKJZhERqZ2K\ngoiIJKgoiIhIgoqCiIgkqCiIiEiCioKIiCSoKIiISIKKgoiIJKgoiIhIgoqCiIgkqCiIiEhCqH0f\nZRqN4Swi2U5HCiIikqCiICIiCSoKIiKSoKIgIiIJKgoiIpKgoiAiIgm6JXUX6JZVEWnqVBQakIqG\niGQ6nT4SEZEEFQUREUlQURARkQQVBRERSVBREBGRBBUFERFJ0C2pjWxnt63qllURSQcqCmlGRUNE\noqTTRyIikqCiICIiCTp9lGHqO72U7a+LyJ5RUZAmpb7+p/b0dZGmLtSiYGbjgbuBXOBBd/9djddb\nAo8BBwEbgO+5+7IwM4nsCRUVaepCu6ZgZrnAPcCJwFDgHDMbWqPZRcAmdx8A3AXcGlYeERGpX5hH\nCmOARe6+BMDMngJOBRYktTkVuDGYngz82czM3T3EXCKR0ektSXdhFoUewIqk+SJg7M7auHulmRUD\nnYD1IeYSabJSKRq6mC91sbD+KDezM4ET3P3iYP6HwBh3/1lSm4KgTVEwvzhos6HGtiYAE4LZ/YHP\nGihmZ9K7ACnfnkn3fJD+GZVvz6RTvj7u3qW+RmEeKRQBvZLmewKrdtKmyMyaAR2AjTU35O6TgEkN\nHdDMZrl7fkNvt6Eo355J93yQ/hmVb8+ke77ahPnw2ofAQDPrZ2YtgLOBKTXaTAHOD6bPAP6t6wki\nItEJ7UghuEZwOTCN+C2pD7t7gZndDMxy9ynAQ8DjZraI+BHC2WHlERGR+oX6nIK7TwWm1lh2fdJ0\nKXBmmBnq0eCnpBqY8u2ZdM8H6Z9R+fZMuuf7htAuNIuISOZRh3giIpKQlUXBzMab2WdmtsjMrok6\nT01mtszM5pnZXDObFXUeADN72MzWmtn8pGV5ZvaamX0RfN87zfLdaGYrg/0418xOijBfLzObbmYL\nzazAzK4MlqfFPqwjXzrtw1Zm9oGZfRxkvClY3s/M3g/24dPBjS3plO8RM1uatA9HRZEvVVl3+ijo\nfuNz4Djit8R+CJzj7gvqXLERmdkyIN/d0+X+ZszsCGAb8Ji7Dw+W3QZsdPffBcV1b3f/VRrluxHY\n5u53RJEpmZntA+zj7h+ZWTtgNnAacAFpsA/ryHcW6bMPDWjj7tvMrDnwFnAlcDXwnLs/ZWb3Ax+7\n+31plO/HwD/dfXJjZ9od2XgxL24rAAAFyUlEQVSkkOh+w93LgeruN6QO7v4m33yG5FTg0WD6UeIf\nIpHYSb604e5fuvtHwfRWYCHxJ/rTYh/WkS9teNy2YLZ58OXAMcS7yYFo9+HO8mWUbCwKtXW/kVY/\n/MR/kF41s9nB09zpqpu7fwnxDxWga8R5anO5mX0SnF6K7PRWMjPrC4wG3icN92GNfJBG+9DMcs1s\nLrAWeA1YDGx298qgSaS/zzXzuXv1PvxtsA/vCnqHTlvZWBSslmXpVs0Pc/cDifcwe1lwakR23X3A\nfsAo4EvgzmjjgJm1BZ4FrnL3LVHnqamWfGm1D929yt1HEe8hYQwwpLZmjZsq6Y1r5DOz4cC1wGDg\nYCAPiOQUa6qysSik0v1GpNx9VfB9LfA88R/+dLQmOBddfU56bcR5vsbd1wS/pDHgASLej8F55meB\nJ9z9uWBx2uzD2vKl2z6s5u6bgRnAt4COQTc5kCa/z0n5xgen5tzdy4C/kCb7cGeysSik0v1GZMys\nTXChDzNrAxwPzK97rcgkd1NyPvCPCLN8Q/WHbeB0ItyPwUXIh4CF7v77pJfSYh/uLF+a7cMuZtYx\nmN4LGEf82sd04t3kQLT7sLZ8nyYVfSN+vSNdf5+BLLz7CCC4re4PfNX9xm8jjpRgZv2JHx1A/Inz\nJ9Mhn5n9DTiKeK+Pa4AbgBeAZ4DewHLgTHeP5GLvTvIdRfy0hwPLgEurz99HkO9wYCYwD4gFi39N\n/Lx95PuwjnznkD77cATxC8m5xP+gfcbdbw5+Z54ifmpmDvCD4K/ydMn3b6AL8VPXc4EfJ12QTjtZ\nWRRERKR22Xj6SEREdkJFQUREElQUREQkQUVBREQSVBRERCRBRUEykplVBT1Ozjezv5tZ62B5dzN7\nyswWm9kCM5tqZoOizhs2M7uqeh+I7AkVBclUO9x9VNAjajnw4+DhoOeBGe6+n7sPJX6vfbcogzaS\nqwAVBdljKgrSFMwEBgBHAxXufn/1C+4+191n1lzBzM4LOij72MweD5b1MbPXg+Wvm1nvYPkjZnaf\nxccbWGJmRwadwy00s0eStrnNzO40s4+C9bsEy0eZ2XvBdp+v7lTOzGaY2a0W74P/czP7drA818xu\nN7MPg3UuDZYfFawz2cw+NbMnLO4KYF9gepAxN8g83+LjckwMZ7dLU6SiIBkt6PPmROJP4g4nPg5A\nfesMA64DjnH3kcT7vAf4M/HxGEYATwB/TFptb+JdNE8EXgTuAoYBB9hXg6a0AT4KOjN8g/hT1QCP\nAb8KtjsvaTlAM3cfQ/wv/erlFwHF7n4w8U7ULjGzfsFro4O2Q4H+xDtP/CPx/n6OdvejiT+B3MPd\nh7v7AcT72xFJiYqCZKq9gi6KZxHvHuKhXVj3GGBy9SBGSd1KHAI8GUw/DhyetM6LHn/8fx6wxt3n\nBZ3EFQB9gzYx4Olg+q/A4WbWAejo7m8Eyx8Fknu9re4Yb3bSdo4Hzgv+fe8DnYCBwWsfuHtR8N5z\nk9ZJtgTob2Z/MrPxQNr1xirpq1n9TUTS0o6gi+IEMyvgq47R6mKk1r1ycpvqvnRiSdPV8zv7PUrl\nPaq3VZW0HQN+5u7Tkhua2VE13jt5na/e1H2TmY0ETgAuIz562o9SyCKiIwVpUv4NtDSzS6oXmNnB\nZnZkjXavA2eZWaegTV6w/B3iveYCnEt8OMVdkcNXRen7wFvuXgxsqr5eAPyQ+KmlukwDfhJ0ZY2Z\nDQp6zK3LVqC6d93OQI67Pwv8P+DAXfx3SBbTkYI0Ge7uZnY68AeLj3dcSrxnz6tqtCsws98Cb5hZ\nFfGeNS8ArgAeNrNfAOuAC3cxwnZgmJnNBoqB7wXLzwfuD24ZXZLCdh8kflroo+COqnXUP8TkJOBl\nM/uS+L/3L2ZW/Ufftbv475Aspl5SRRqImW1z97ZR5xDZEzp9JCIiCTpSEBGRBB0piIhIgoqCiIgk\nqCiIiEiCioKIiCSoKIiISIKKgoiIJPx/1Xg3Wj+JtqkAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1a1221b9e8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "fig, ax = plt.subplots()\n",
    "ax.bar(range(X_train_std.shape[1]), pca.explained_variance_ratio_)\n",
    "ax.plot(range(X_train_std.shape[1]), np.cumsum(pca.explained_variance_ratio_))\n",
    "plt.xlabel(\"PC components\")\n",
    "plt.ylabel(\"Variance retention\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>retention</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.218011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.340155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.425990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.508032</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.556731</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.597943</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.627196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.655609</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.683652</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.710830</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   retention\n",
       "0   0.218011\n",
       "1   0.340155\n",
       "2   0.425990\n",
       "3   0.508032\n",
       "4   0.556731\n",
       "5   0.597943\n",
       "6   0.627196\n",
       "7   0.655609\n",
       "8   0.683652\n",
       "9   0.710830"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({\"retention\": np.cumsum(pca.explained_variance_ratio_)})\\\n",
    "    .query(\"retention>0.99\")[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 45.5 s, sys: 21.5 s, total: 1min 7s\n",
      "Wall time: 1min\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "pca = decomposition.PCA(random_state=1, n_components=23)\n",
    "X_train_pca = pca.fit_transform(X_train_std)\n",
    "X_test_pca = pca.transform(X_test_std)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((3428901, 23), (1469530, 23), (3428901,), (1469530,))"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_pca.shape, X_test_pca.shape, y_train.shape, y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training accuracy: 0.9972489727758252 Test accuracy: 0.9971732458677264\n",
      "CPU times: user 4min 23s, sys: 9.59 s, total: 4min 32s\n",
      "Wall time: 4min 37s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "est = linear_model.LogisticRegression()\n",
    "est.fit(X_train_std, y_train)\n",
    "print(\"Training accuracy:\", est.score(X_train_std, y_train),\n",
    "    \"Test accuracy:\", est.score(X_test_std, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training accuracy: 0.9973606703722271 Test accuracy: 0.9972794022578647\n",
      "CPU times: user 3min 2s, sys: 4.94 s, total: 3min 7s\n",
      "Wall time: 3min 8s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "est = linear_model.LogisticRegression()\n",
    "est.fit(X_train_pca, y_train)\n",
    "print(\"Training accuracy:\", est.score(X_train_pca, y_train),\n",
    "    \"Test accuracy:\", est.score(X_test_pca, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
